******************************** 
** First file to prepare data **
******************************** 

clear all
set more off

mkdir NBER
mkdir USPTO
mkdir Compustat
mkdir other

******************************************************************************
** Patent characteristics -- tech class, assignee, industry, state, circuit **
******************************************************************************

use "\NBER\pat76_06_assg.dta" , clear
keep patent pdpass gyear appyear 
duplicates drop 
save "pat_gappyr.dta", replace


use "\NBER\pat76_06_assg.dta", clear
gen location = state
replace location = "X" + country if location==""
lab var location "State, unless overseas in which case country"
drop asscode
rename appyear year
rename state state_code 

merge m:1 state_code year using "fedapp_yrs.dta"
keep if _merge == 3
 // drop non-US assignees 
drop _merge 
drop if mi(circuit)

merge m:1 pdpass using "\NBER\dynass.dta"
tab _m
keep if _m == 3 

gen gvkey =.
forvalue i = 1/5 {
replace gvkey = gvkey`i' if gvkey`i'~=. & year >= begyr`i' & year <= endyr`i'
}
drop if mi(gvkey)
drop _merge
drop begyr1-endyr5
merge m:1 gvkey year using "\Compustat\gvkey-sic.dta"
keep if _merge == 3
///drop gvkeys without Compustat financials
drop _merge  
 
savesome patent pdpass state_code gvkey year circuit sic sic2 nclass subcat ///
  using "pat_gvkey_ct.dta", replace 

  
*************************************** 
** Dominant nclass of parent company **
*************************************** 
*Two criteria: >50% & ranked highest 
use "pat_gvkey_ct.dta", clear
gen n = 1 
bysort gvkey nclass year: egen nsum = sum(n)

foreach j in 3 5{          
forvalues i = 1976/2006{
 preserve
 keep if year > `i' - `j' & year <= `i'
 bysort gvkey nclass: egen patsum_`j' = sum(nsum)
 bysort gvkey: egen patall_`j' = sum(nsum)
 gen patsum_rate_`j' = patsum_`j'/patall_`j'
 keep gvkey nclass patsum_`j' patall_`j' patsum_rate_`j'
 duplicates drop 
 gen year = `i'
 tempfile pat`j'_`i'
 save `pat`j'_`i'', replace 
 restore
}  
}

foreach j in 3 5{          
 use `pat`j'_1976', clear

 forvalues i = 1977/2006{
 append using `pat`j'_`i''
 }
 bysort gvkey year: egen patmax = max(patsum_`j')
 gen dtnclass_`j' = cond(patsum_`j' == patmax, cond(patsum_rate_`j' > 0.5, 1, 0), 0)
 replace dtnclass_`j' = . if patsum_rate_`j' == .
 gen dtn_`j' = nclass if dtnclass_`j' == 1
 bysort gvkey year: egen temp = mean(dtn_`j')
 replace dtn_`j' = temp if mi(dtn_`j')
 drop temp
 keep gvkey nclass year dtnclass_`j' patsum_`j' patsum_rate_`j' dtn_`j'
 duplicates drop  
 tempfile patmax_`j'
 save `patmax_`j'', replace
}

merge 1:1 gvkey nclass year using `patmax_3', nogen
merge 1:1 gvkey nclass year using `patmax_5', nogen
 
label var patsum_3 "Patents at t-2, t-1 and t at one nclass"
label var patsum_5 "Patents at t-4, t-3, t-2, t-1 and t at one nclass"
label var dtnclass_3 "Dominant tech class for all patents at t-2, t-1 and t or not"
label var dtnclass_5 "Dominant tech class for all patents at t-4, t-3, t-2, t-1 and t or not"
label var patsum_rate_3 "Percent of patents filed at one nclass at (t-3, t]"
label var patsum_rate_5 "Percent of patents filed at one nclass at (t-5, t]"

order gvkey nclass year 
sort gvkey nclass year 

label data "Gvkey-nclass-year panel"
compress
save "gvkey_nclassmax.dta", replace 


************************************************* 
** Dominant nclass of each business (assignee) **
************************************************* 
* Businesses might be classified with same pdpass but matched to different 
* Gvkey due to transfer of ownership

use "pat_gvkey_ct.dta", clear
gen n = 1 
bysort gvkey pdpass nclass year: egen nsum = sum(n)

foreach j in 3 5{          
forvalues i = 1976/2006{
 preserve
 keep if year > `i' - `j' & year <= `i'
 bysort gvkey pdpass nclass: egen pd_patsum_`j' = sum(nsum)
 bysort gvkey pdpass: egen pd_patall_`j' = sum(nsum)
 gen pd_patsum_rate_`j' = pd_patsum_`j'/pd_patall_`j'
 keep gvkey pdpass nclass pd_patsum_`j' pd_patall_`j' pd_patsum_rate_`j'
 duplicates drop 
 gen year = `i'
 tempfile pat`j'_`i'
 save `pat`j'_`i'', replace 
 restore
}  
}

foreach j in  3 5{          
 use `pat`j'_1976', clear

 forvalues i = 1977/2006{
 append using `pat`j'_`i''
 }
 bysort gvkey pdpass year: egen patmax = max(pd_patsum_`j')
 gen pd_dtnclass_`j' = cond(pd_patsum_`j' == patmax, cond(pd_patsum_rate_`j' > 0.5, 1, 0), 0)
 replace pd_dtnclass_`j' = . if pd_patsum_rate_`j' == .
 gen pd_dtn_`j' = nclass if pd_dtnclass_`j' == 1
 bysort gvkey pdpass year: egen temp = mean(pd_dtn_`j') 
 replace pd_dtn_`j' = temp if mi(pd_dtn_`j')
 drop temp 
 keep pdpass gvkey nclass year pd_dtnclass_`j' pd_patsum_`j' pd_patsum_rate_`j' pd_dtn_`j'
 duplicates drop  
 tempfile patmax_`j'
 save `patmax_`j'', replace
}

use `patmax_3', clear 
merge 1:1 gvkey pdpass nclass year using `patmax_5', nogen
 
label var pd_patsum_3 "Patents at t-2, t-1 and t (business)"
label var pd_patsum_5 "Patents at t-4, t-3, t-2, t-1 and t (business)"
label var pd_dtnclass_3 "Dominant tech class for all patents at t-2, t-1 and t (business) or not"
label var pd_dtnclass_5 "Dominant tech class for all patents at t-4, t-3, t-2, t-1 and t (business) or not"
label var pd_patsum_rate_3 "Percent of patents filed at one nclass at (t-3, t] (business)"
label var pd_patsum_rate_5 "Percent of patents filed at one nclass at (t-5, t] (business)"

order pdpass gvkey nclass year 
sort pdpass  gvkey nclass year 
label data "gvkey-pdpass-nclass-year panel"
compress
save "pdpass_nclassmax.dta", replace 

**********************
**patent citations**
********************
use "\NBER\cite76_06.dta", clear
rename cited patent 
joinby patent using "pat_gappyr.dta"
rename gyear cited_gyr 
rename appyear cited_appyr
rename pdpass cited_pdpass 
rename patent cited 

rename citing patent 
joinby patent using "pat_gappyr.dta" 
rename gyear citing_gyr 
rename appyear citing_appyr
rename pdpass citing_pdpass 
rename patent citing

gen self_cite = cond(citing_pdpass == cited_pdpass, 1, 0)
gen gap = citing_appyr - cited_appyr 
gen threeyr = cond(gap <= 3, 1, 0 )

preserve
keep if self_cite == 0
keep if threeyr == 1
bysort cited: gegen cites_others_3yr = nvals(citing)
keep cited cites_others_3yr
duplicates drop 
rename cited patent 
save "ncites_3yr.dta", replace 
restore

preserve 
keep if self_cite == 1
keep if threeyr == 1
bysort cited: gegen cites_self_3yr = nvals(citing)
keep cited cites_self_3yr
duplicates drop 
rename cited patent 
save "selfncites_3yr.dta", replace 
restore 

*******************************************************************
** Calculate number of patents in each circuit for each business **
** in the dominant technology domain of the parent company       **     
*******************************************************************
use "pat_gvkey_ct.dta", clear
merge m:1 gvkey nclass year using "gvkey_nclassmax.dta"
keep if _merge == 3
drop _merge 

foreach i in  3 5{
bysort pdpass circuit year: egen npatdn_`i' = sum(dtnclass_`i'), missing
label var npatdn_`i' "Patents in the dominant class identified from (t-`i', t] of the parent company filed by the business"
}

keep gvkey pdpass circuit year npatdn_*  dtn_*
duplicates drop 
tempfile temp
save `temp', replace 

use "pat_gvkey_ct.dta", clear
merge m:1 gvkey pdpass nclass year using "pdpass_nclassmax.dta"
keep if _merge == 3
drop _merge 

foreach i in 3 5{
 bysort pdpass gvkey circuit year: egen pd_npatdn_`i' = sum(pd_dtnclass_`i'), missing
 label var pd_npatdn_`i' "Patents in the dominant class identified from (t-`i', t] of the business"
}
keep  gvkey pdpass circuit year pd_npatdn_*  pd_dtn_*
duplicates drop 

merge 1:1 gvkey pdpass circuit year using `temp'
drop _merge 

label var pd_dtn_3 "Dominant class defined by busienss patent pool (t-3,t]"
label var pd_dtn_5 "Dominant class defined by busienss patent pool (t-5,t]"
label var dtn_3 "Dominant class defined by parent patent pool (t-3,t]"
label var dtn_5 "Dominant class defined by parent patent pool (t-5,t]"
label data "Business-Circuit-year panel identifying number of patents filed in dominant tech class"

order gvkey pdpass circuit year dtn_3 npatdn_3 dtn_5 npatdn_5 pd_dtn_3 pd_npatdn_3 pd_dtn_5 pd_npatdn_5
foreach i in npatdn_3 npatdn_5 pd_npatdn_3 pd_npatdn_5{
 replace `i' = 0 if mi(`i')
}
compress
save "pd_npatdn.dta", replace 


**************************************
** Patent class by circuit and year **
**************************************

use "pd_npatdn.dta", clear 
foreach i in dtn_3 dtn_5 pd_dtn_3 pd_dtn_5{
preserve 
rename `i' nclass 
drop if mi(nclass)
bysort nclass year: egen `i'_pd = nvals(pdpass)
bysort nclass circuit year: egen  `i'_ct_pd = nvals(pdpass)
keep nclass circuit year `i'_pd `i'_ct_pd
gen `i'_ct_rate = `i'_ct_pd/`i'_pd
duplicates drop 
tempfile temp_`i'
save `temp_`i'', replace 
restore 
}

use `temp_dtn_3', clear  
merge 1:1 nclass circuit year using `temp_dtn_5', nogen 
merge 1:1 nclass circuit year using `temp_pd_dtn_3', nogen 
merge 1:1 nclass circuit year using `temp_pd_dtn_5', nogen 
fillin nclass circuit year 
foreach i of varlist dtn_3_pd-pd_dtn_5_ct_rate{
 replace `i' = 0 if mi(`i')
}
drop _fillin

label data "nclass-circuit-year panel describing the geographical distribution of businesses working on the same dominant technology class" 

label var dtn_3_pd "Total n of businesses patented in this class as dominant tech class identified by parent company (t-2, t]"
label var dtn_3_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-2, t]"
label var dtn_3_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-2, t]"

label var dtn_5_pd "Total n of businesses patented in this class as dominant tech class identified by parent company (t-4, t]"
label var dtn_5_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-4, t]"
label var dtn_5_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-4, t]"

label var pd_dtn_5_pd "Total n of businesses patented in this class as dominant tech class identified by businesses (t-4, t]"
label var pd_dtn_5_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-4, t]"
label var pd_dtn_5_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-4, t]"

label var pd_dtn_3_pd "Total n of businesses patented in this class as dominant tech class identified by businesses (t-2, t]"
label var pd_dtn_3_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-2, t]"
label var pd_dtn_3_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-2, t]"
compress
save "pd_dtn_ctyr.dta", replace 


********************************************************** 
** New entrants to patent class and circuit by industry **
********************************************************** 

use "pat_gvkey_ct.dta", clear
drop if mi(nclass)
bysort pdpass circuit nclass (year): gen tclass = _n
gen byte new_entry = cond(tclass == 1,  1, 0)
bysort nclass circuit sic year: egen new_sic_nclass_ct = sum(new_entry)
bysort nclass circuit year: egen new_nclass_ct = sum(new_entry)
bysort pdpass nclass circuit year: egen new_pd_nclass_ct = sum(new_entry)
keep pdpass nclass circuit sic year new_sic_nclass_ct new_nclass_ct new_pd_nclass_ct
*compile a pdpass-nclass-circuit-year panel 
duplicates drop

gen new_sic_entry = new_sic_nclass_ct - new_pd_nclass_ct
gen new_entry = new_nclass_ct - new_pd_nclass_ct

bysort pdpass nclass circuit (year): gen new_entry_f3 = new_entry[_n+1] + ///
 new_entry[_n+2] + new_entry[_n+3]
bysort pdpass nclass circuit (year): gen new_entry_f5 = new_entry[_n+1] + ///
 new_entry[_n+2] + new_entry[_n+3] + new_entry[_n+4] + new_entry[_n+5]
bysort pdpass nclass circuit (year): gen new_sic_entry_f3 = new_sic_entry[_n+1] + ///
 new_sic_entry[_n+2] + new_sic_entry[_n+3]
bysort pdpass nclass circuit (year): gen new_sic_entry_f5 = new_sic_entry[_n+1] + ///
 new_sic_entry[_n+2] + new_sic_entry[_n+3] + new_sic_entry[_n+4] + new_sic_entry[_n+5]
  
preserve
use "pd_npatdn.dta" , clear 
rename dtn_3 nclass 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit nclass year using `temp' 
drop gvkey-pd_npatdn_5
drop if _merge == 2
gen  dtn_3 = 1 if _merge == 3
replace dtn_3 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatdn.dta" , clear 
rename dtn_5 nclass 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit nclass year using `temp' 
drop gvkey-pd_npatdn_5
drop if _merge == 2
gen  dtn_5 = 1 if _merge == 3
replace dtn_5 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatdn.dta" , clear 
rename pd_dtn_3 nclass 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit nclass year using `temp' 
drop gvkey-pd_npatdn_5
drop if _merge == 2
gen  pd_dtn_3 = 1 if _merge == 3
replace pd_dtn_3 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatdn.dta" , clear 
rename pd_dtn_5 nclass 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit nclass year using `temp' 
drop gvkey-pd_npatdn_5
drop if _merge == 2
gen  pd_dtn_5 = 1 if _merge == 3
replace pd_dtn_5 = 0 if _merge == 1 
drop _merge 

label var pd_dtn_3 "Dominant class defined by business patent pool (t-3,t]"
label var pd_dtn_5 "Dominant class defined by business patent pool (t-5,t]"
label var dtn_3 "Dominant class defined by parent patent pool (t-3,t]"
label var dtn_5 "Dominant class defined by parent patent pool (t-5,t]"

foreach i in dtn_3 dtn_5 pd_dtn_3 pd_dtn_5{
gen new_sic_entry_`i' = new_sic_entry if `i' == 1
replace new_sic_entry_`i' = 0 if `i' == 0
gen new_entry_`i' = new_nclass_ct - new_pd_nclass_ct if `i' == 1
replace new_entry_`i' = 0 if `i' == 0 
 }
 
* data is organized as business-circuit-nclass-year, now construct
* panel by business-circuit-year 

gcollapse (mean) newentry_mean = new_entry ///
  newentry_mean_f3 = new_entry_f3 ///
  new_sic_entry_mean = new_sic_entry  ///
  new_sic_entry_mean_f3 = new_sic_entry_f3  ///
  (sum) newentry_sum = new_entry ///
  newentry_sum_f3 = new_entry_f3 ///
  newentry_sic_sum = new_sic_entry /// 
  newentry_sic_sum_f3 = new_sic_entry_f3  ///
  (sum) new_sic_entry_dtn_3 new_sic_entry_dtn_5 new_sic_entry_pd_dtn_3 ///
  new_sic_entry_pd_dtn_5 new_entry_dtn_3 new_entry_dtn_5 new_entry_pd_dtn_3 new_entry_pd_dtn_5,   by(pdpass circuit year)
  
compress
save "newentry_pdct.dta", replace


***************************** 
** Circuit characteristics **
***************************** 

use "\other\tsayr-public-20151120.dta", clear

merge m:1  state using "state_to_statecode.dta", nogen
merge 1:1 state_code year using "fedapp_yrs.dta"
keep if _merge == 3
drop _merge 

bysort circuit year: gegen utsan_mean = mean(utsan)
bysort circuit year: gegen idxe6cl_mean = mean(idxe6cl)
 
merge 1:1 state year using "\other\indy.dta" 
keep if _merge == 3 
drop _merge 

bysort circuit year: gegen va1_sum = sum(va1)
label var va1_sum "All industry total"
foreach i in va3 va6 va11 va12 va13 va36 va47 va48 va49 va57 va72 va76 va77 va78{
 bysort circuit year: gegen `i'_sum = sum(`i')
 gen `i'_perct = `i'_sum/va1_sum
}
foreach j in sum perct{
 label var va3_`j' "Agriculture"
 label var va6_`j' "Mining"
 label var va11_`j' "Construction"
 label var va12_`j' "Manufacturing"
 label var va13_`j' "Durables manufacturing"
 label var va36_`j' "Transportation"
 label var va47_`j' "Wholesale trade"
 label var va48_`j' "Retail trade"
 label var va49_`j' "Finance, insurance and real estate"
 label var va57_`j' "Services"
 label var va72_`j' "Government"
 label var va76_`j' "Electronic equipment and instruments"
 label var va77_`j' "Depository and nondepository institutions"
 label var va78_`j' "Business services and other services"
}

gen perct = va1/va1_sum
gen temp1 = utsan * perct
gen temp2 = idxe6cl *  perct 

bysort circuit year: gegen utsan_perct_ct = sum(temp1)
bysort circuit year: gegen idxe6cl_perct_ct = sum(temp2)
bysort circuit year: gegen utsan_sum_ct = sum(utsan)
bysort circuit year: gegen idxe6cl_sum_ct = sum(idxe6cl)

keep circuit year utsan_mean-idxe6cl_mean utsan_perct_ct idxe6cl_perct_ct ///
  utsan_sum_ct idxe6cl_sum_ct va1_sum-va78_perct 
  
label var utsan_mean "Circuit level average utsan"
label var idxe6cl_mean "Circuit level average idxe6cl"
label var utsan_perct_ct "Weighted sum (GDP) circuit level utsan"
label var idxe6cl_perct_ct "Weighted sum (GDP) circuit level idxe6cl"
label var utsan_sum_ct "Circuit level sum utsan"
label var idxe6cl_sum_ct "Circuit level sum idxe6cl"

duplicates drop
compress
save "\other\tsayr_ct.dta", replace

**Construct other patent variables at the circuit year level**
***************************************
** Continuation and division patents **
***************************************
use "\USPTO\documentid.dta", clear
rename appno_doc_num application_number
rename grant_doc_num patent 
keep application_number patent
drop if mi(application_number)
*drop if mi(patent)
compress
save "\USPTO\pat_app.dta", replace
 
use "\USPTO\continuity_parents.dta", clear
gen div = 1 if continuation_type == "DIV"
replace div = 0 if mi(div)
gen con = 1 if  continuation_type == "CON"
replace con = 0 if mi(con)
gen cip = 1 if  continuation_type == "CIP"
replace cip = 0 if mi(cip)

keep parent_application_number application_number div con cip 
duplicates drop 
gen source = "parent"
compress
save "\USPTO\condiv1.dta", replace 

use "\USPTO\continuity_children.dta", clear
rename application_number parent_application_number
rename child_application_number application_number

gen div = 1 if continuation_type == "DIV"
replace div = 0 if mi(div)
gen con = 1 if  continuation_type == "CON"
replace con = 0 if mi(con)
gen cip = 1 if  continuation_type == "CIP"
replace cip = 0 if mi(cip)

keep parent_application_number application_number div con cip 
duplicates drop 
gen source = "child"
append using "\USPTO\condiv1.dta"

duplicates drop parent_application_number application_number div con cip, force

duplicates tag parent_application_number application_number, gen(check)
drop if check != 0 & source == "child"
drop check source

bysort application_number: gegen DIV = max(div)
bysort application_number: gegen CON  = max(con)
bysort application_number: gegen CIP  = max(cip)

bysort parent_application_number: gegen DIV_parent = sum(div)
bysort parent_application_number: gegen CON_parent  = sum(con)
bysort parent_application_number: gegen CIP_parent  = sum(cip)
compress

preserve 
keep parent_application_number DIV_parent CON_parent CIP_parent
duplicates drop
save "\USPTO\parent_condiv.dta", replace 
restore

keep application_number DIV CON CIP
duplicates drop 
save "\USPTO\pat_condiv.dta", replace 

*Locate patent number by patent application number
use application_number patent_number using "\USPTO\application_data"
merge 1:1 application_number using "\USPTO\pat_condiv.dta"
keep if _merge == 3
drop _merge 

egen patent = sieve(patent_number), keep(numeric)
destring patent, replace 
drop if mi(patent)
duplicates tag patent, gen(check)
*br if check != 0  
drop if patent_number == "RE30110"
drop if patent_number == "RE32505"
drop check
compress
save "\USPTO\pat_patapp.dta", replace 

use application_number patent_number using "\USPTO\application_data"
rename application_number parent_application_number
merge 1:1 parent_application_number using "\USPTO\parent_condiv.dta"
keep if _merge == 3
drop _merge 
rename patent_number patent 

egen patent = sieve(patent_number), keep(numeric)
destring patent, replace 
drop if mi(patent)
compress
save "\USPTO\parentpat_patapp.dta", replace 


***************************************************
** Construct business-circuit level patent count **
***************************************************
use "pat_gvkey_ct.dta", clear
merge m:1 patent using "\USPTO\pat_patapp.dta"
drop if _merge == 2
foreach i of varlist DIV CON CIP{
 replace `i' = 0 if mi(`i')
}
drop _merge application_number patent_number 

egen strt_yes = rowmax(DIV CON CIP)

merge m:1 patent using "\USPTO\parentpat_patapp.dta"
drop if _merge == 2
foreach i  of varlist DIV_parent-CIP_parent{
 replace `i' = 0 if mi(`i')
}
foreach i in DIV CON CIP{
 gen `i'parent_yes  = cond(`i' == 0 , 0 , 1)
}
drop _merge parent_application_number patent_number

gen parent_yes = 1 if CONparent_yes == 1 | CIPparent_yes == 1 | DIVparent_yes == 1
replace parent_yes = 0 if CONparent_yes == 0 & CIPparent_yes == 0 & ///
  DIVparent_yes == 0
  
gen inde_pat = 1 if DIV == 0 & CON == 0 & CIP == 0 & DIV_parent == 0 & ///
 CIP_parent == 0 & DIV_parent == 0
replace inde_pat = 0 if mi(inde_pat) 

gcollapse (sum) inde_pat strt_yes parent_yes DIV CON CIP DIV_parent-CIPparent_yes, ///
 by(pdpass circuit year)
foreach i in DIV CON CIP{
 label var `i'_parent "Number of `i' parent patents under all granted patents"
 label var `i'parent_yes "Number of `i' parent patents"
}
label var parent_yes "Number of patents with continuation patents"
compress
save "\USPTO\condivpat_pdct.dta", replace 


************************************************* 
** Calculate number of single-assignee patents **
************************************************* 
use "\NBER\pat76_06_assg.dta", clear
gen location = state
replace location = "X" + country if location==""
lab var location "State, unless overseas in which case country"
drop asscode
rename appyear year
rename state state_code 

bysort patent: gegen n_ass = nvals(pdpass)
keep if n_ass == 1

merge m:1 state_code year using "fedapp_yrs.dta"
keep if _merge == 3
 // drop non-US assignees 
drop _merge 
drop if mi(circuit)
gen n = 1
gcollapse (sum) n , by(pdpass circuit year)
rename n npat_single
save "singleass_npat_pdct.dta", replace 


**************************************************************************
** Number of patent classes and tech subcategories by business and year **
**************************************************************************
use "NBER\pat_gvkey_ct.dta", clear
drop if mi(nclass)
drop if mi(subcat)
bysort pdpass circuit year: egen n_nclass = nvals(nclass)
bysort pdpass circuit year: egen n_subcat = nvals(subcat)
keep pdpass circuit year n_nclass  n_subcat
duplicates drop 
save "nnclass_pdct.dta", replace


****************************************************
**Calculate technological complexity for each business**
****************************************************
** Concept follows that Hall, Graevenitz and Helmers, oep2020
** (1) compute number of patents filed in past ten years in each patent class
** (2) count actual number of citations to the prior patents within patent class 
** (3) calculate ratio between citations made not within the patent pool 
use "\NBER\pat76_06_assg.dta", clear
rename appyear year 
keep patent subcat year 
bysort subcat: egen pat_ct_subcat = nvals(patent)
duplicates drop 
save "subcatpat_all.dta", replace 
keep if year <= 1983 
save "subcatpat_precafc.dta", replace 

use "\NBER\cite76_06.dta", clear
rename citing patent
merge m:1 patent using "subcatpat_precafc.dta"
drop if _merge == 1
drop _merge 

rename subcat subcat_citing
rename patent citing
rename year year_citing
rename pat_ct_subcat pat_ct_subcat_citing 

rename cited patent
merge m:1 patent using "subcatpat_all.dta"
drop if _merge == 2
drop _merge 
rename subcat subcat_cited 
rename patent cited
rename year year_cited 
rename pat_ct_subcat pat_ct_subcat_cited

gen n = 1 if !mi(citing) & !mi(cited)
gen nsame = 1 if !mi(citing) & !mi(cited) & subcat_cited == subcat_citing 
bysort subcat_citing: egen cited_tt_samecat = sum(nsame)
bysort subcat_citing: egen cited_tt = sum(n)

keep subcat_citing cited_tt cited_tt_samecat pat_ct_subcat_citing
duplicates drop 

gen cites_max = pat_ct_subcat_citing * (pat_ct_subcat_citing-1)
gen comcites_subcat = cited_tt_samecat/pat_ct_subcat_citing
rename subcat_citing subcat 
keep subcat comcites_subcat
drop if mi(subcat)
label var comcites_subcat "Citation percent within the same tech class in the pre-CAFC era"
save "subcat_comcites.dta", replace 

**Generate a variable of overall common citations for each business 
**weighting the comcites variable by percent of patent at each subcat_cited
use "pat_gvkey_ct.dta", clear
keep if year <= 1983 
bysort pdpass subcat: egen npat_subcat_precafc = nvals(patent)
bysort pdpass: egen npat_pdttl_precafc = nvals(patent)
keep pdpass subcat npat_subcat_precafc npat_pdttl_precafc
duplicates drop 
merge m:1 subcat using "subcat_comcites.dta"
keep if _merge == 3
drop _merge 

gen temp = npat_subcat_precafc/npat_pdttl_precafc*comcites_subcat
bysort pdpass: egen pd_comcites = sum(temp)
keep pdpass pd_comcites
duplicates drop 
label var pd_comcites "Technological complexity (wt by number of patents) across all subcat before CAFC"
save "pd_comcites.dta", replace 


****************************************************
** Calculate technological complexity by industry **
****************************************************
use "pat_gvkey_ct.dta", clear
keep if year <= 1983 
gen sic2 = int(sic/100)
bysort sic2 subcat: egen npat_subcat_precafc = nvals(patent)
bysort sic2: egen npat_pdttl_precafc = nvals(patent)
keep sic2 subcat npat_subcat_precafc npat_pdttl_precafc
duplicates drop 
merge m:1 subcat using "subcat_comcites.dta"
keep if _merge == 3
drop _merge 

gen temp = npat_subcat_precafc/npat_pdttl_precafc*comcites_subcat
bysort sic2: egen sic2_comcites = sum(temp)
keep sic2 sic2_comcites
duplicates drop 
label var sic2_comcites "Technological complexity (wt by number of patents) across all subcat before CAFC for the industry"
save "sic2_comcites.dta", replace 


******************************************************
** Identify dominant subcat for each parent company **
******************************************************

*Two criteria: >50% & ranked highest 
use "pat_gvkey_ct.dta", clear
gen n = 1 
bysort gvkey subcat year: egen nsum = sum(n)

foreach j in 3 5{          
forvalues i = 1976/2006{
 preserve
 keep if year > `i' - `j' & year <= `i'
 bysort gvkey subcat: egen patsum_`j' = sum(nsum)
 bysort gvkey: egen patall_`j' = sum(nsum)
 gen patsum_rate_`j' = patsum_`j'/patall_`j'
 keep gvkey subcat patsum_`j' patall_`j' patsum_rate_`j'
 duplicates drop 
 gen year = `i'
 tempfile pat`j'_`i'
 save `pat`j'_`i'', replace 
 restore
}  
}

foreach j in 3 5{          
 use `pat`j'_1976', clear

 forvalues i = 1977/2006{
 append using `pat`j'_`i''
 }
 bysort gvkey year: egen patmax = max(patsum_`j')
 gen dtsubcat_`j' = cond(patsum_`j' == patmax, cond(patsum_rate_`j' > 0.5, 1, 0), 0)
 replace dtsubcat_`j' = . if patsum_rate_`j' == .
 gen dts_`j' = subcat if dtsubcat_`j' == 1
 bysort gvkey year: egen temp = mean(dts_`j')
 replace dts_`j' = temp if mi(dts_`j')
 drop temp
 keep gvkey subcat year dtsubcat_`j' patsum_`j' patsum_rate_`j' dts_`j'
 duplicates drop  
 tempfile patmax_`j'
 save `patmax_`j'', replace
}

merge 1:1 gvkey subcat year using `patmax_3', nogen
merge 1:1 gvkey subcat year using `patmax_5', nogen
 
label var patsum_3 "Patents at t-2, t-1 and t at one nclass"
label var patsum_5 "Patents at t-4, t-3, t-2, t-1 and t at one nclass"
label var dtsubcat_3 "Dominant tech class for all patents at t-2, t-1 and t or not"
label var dtsubcat_5 "Dominant tech class for all patents at t-4, t-3, t-2, t-1 and t or not"
label var patsum_rate_3 "Percent of patents filed at one nclass at (t-3, t]"
label var patsum_rate_5 "Percent of patents filed at one nclass at (t-5, t]"

order gvkey subcat year 
sort gvkey subcat year 

rename patsum_3 patsum_subcat_3
rename patsum_5 patsum_subcat_5
rename patsum_rate_3 patsum_rate_subcat_3 
rename patsum_rate_5 patsum_rate_subcat_5

label data "Gvkey-subcat-year panel"
compress
save "gvkey_subcatmax.dta", replace 


**********************************************
**Identify dominant subcat for each business**
**********************************************
**Businesses might be subject to the same pdpass but matched to different 
*Gvkey due to transfer of ownership
use "pat_gvkey_ct.dta", clear
gen n = 1 
bysort gvkey pdpass subcat year: egen nsum = sum(n)

foreach j in 3 5{          
forvalues i = 1976/2006{
 preserve
 keep if year > `i' - `j' & year <= `i'
 bysort gvkey pdpass subcat: egen pd_patsum_`j' = sum(nsum)
 bysort gvkey pdpass: egen pd_patall_`j' = sum(nsum)
 gen pd_patsum_rate_`j' = pd_patsum_`j'/pd_patall_`j'
 keep gvkey pdpass subcat pd_patsum_`j' pd_patall_`j' pd_patsum_rate_`j'
 duplicates drop 
 gen year = `i'
 tempfile pat`j'_`i'
 save `pat`j'_`i'', replace 
 restore
}  
}

foreach j in  3 5{          
 use `pat`j'_1976', clear

 forvalues i = 1977/2006{
 append using `pat`j'_`i''
 }
 bysort gvkey pdpass year: egen patmax = max(pd_patsum_`j')
 gen pd_dtsubcat_`j' = cond(pd_patsum_`j' == patmax, cond(pd_patsum_rate_`j' > 0.5, 1, 0), 0)
 replace pd_dtsubcat_`j' = . if pd_patsum_rate_`j' == .
 gen pd_dts_`j' = subcat if pd_dtsubcat_`j' == 1
 bysort gvkey pdpass year: egen temp = mean(pd_dts_`j') 
 replace pd_dts_`j' = temp if mi(pd_dts_`j')
 drop temp 
 keep pdpass gvkey subcat year pd_dtsubcat_`j' pd_patsum_`j' pd_patsum_rate_`j' pd_dts_`j'
 duplicates drop  
 tempfile patmax_`j'
 save `patmax_`j'', replace
}

use `patmax_3', clear 
merge 1:1 gvkey pdpass subcat year using `patmax_5', nogen
 
label var pd_patsum_3 "Patents at t-2, t-1 and t (business)"
label var pd_patsum_5 "Patents at t-4, t-3, t-2, t-1 and t (business)"
rename pd_patsum_3 pd_patsum_subcat_3
rename pd_patsum_5 pd_patsum_subcat_5

label var pd_dtsubcat_3 "Dominant tech class for all patents at t-2, t-1 and t (business) or not"
label var pd_dtsubcat_5 "Dominant tech class for all patents at t-4, t-3, t-2, t-1 and t (business) or not"
label var pd_patsum_rate_3 "Percent of patents filed at one nclass at (t-3, t] (business)"
label var pd_patsum_rate_5 "Percent of patents filed at one nclass at (t-5, t] (business)"
rename pd_patsum_rate_3 pd_patsum_rate_subcat_3
rename pd_patsum_rate_5 pd_patsum_rate_subcat_5

order pdpass gvkey subcat year 
sort pdpass  gvkey subcat year 
label data "gvkey-pdpass-subcat-year panel"
compress
save "pdpass_subcatmax.dta", replace 


********************************************************************
** Compute number of patents in the dominant technology domain   ***
** of the parent company (subcat version) by business and circuit **
********************************************************************
use "pat_gvkey_ct.dta", clear
merge m:1 gvkey subcat year using "gvkey_subcatmax.dta"
keep if _merge == 3
drop _merge 

foreach i in  3 5{
bysort pdpass circuit year: egen npatds_`i' = sum(dtsubcat_`i'), missing
label var npatds_`i' "Patents in the dominant subcat identified from (t-`i', t] of the parent company filed by the business"
}

keep gvkey pdpass circuit year npatds_*  dts_*
duplicates drop 
tempfile temp
save `temp', replace 

use "pat_gvkey_ct.dta", clear
merge m:1 gvkey pdpass subcat year using "pdpass_subcatmax.dta"
keep if _merge == 3
drop _merge 

foreach i in 3 5{
 bysort pdpass gvkey circuit year: egen pd_npatds_`i' = sum(pd_dtsubcat_`i'), missing
 label var pd_npatds_`i' "Patents in the dominant subcat identified from (t-`i', t] of the business"
}
keep  gvkey pdpass circuit year pd_npatds_*  pd_dts_*
duplicates drop 

merge 1:1 gvkey pdpass circuit year using `temp'
drop _merge 

label var pd_dts_3 "Dominant subcat defined by business patent pool (t-3,t]"
label var pd_dts_5 "Dominant subcat defined by business patent pool (t-5,t]"
label var dts_3 "Dominant subcat defined by parent patent pool (t-3,t]"
label var dts_5 "Dominant subcat defined by parent patent pool (t-5,t]"
label data "Business-Circuit-year panel identifying number of patents filed in dominant subcat"

order gvkey pdpass circuit year dts_3 npatds_3 dts_5 npatds_5 pd_dts_3 pd_npatds_3 pd_dts_5 pd_npatds_5
foreach i in npatds_3 npatds_5 pd_npatds_3 pd_npatds_5{
 replace `i' = 0 if mi(`i')
}
compress
save "pd_npatds.dta", replace 


**************************************
**Identify subcat-circuit-year USPTOs**
**************************************
use "pd_npatds.dta", clear 
foreach i in dts_3 dts_5 pd_dts_3 pd_dts_5{
preserve 
rename `i' subcat 
drop if mi(subcat)
bysort subcat year: egen `i'_pd = nvals(pdpass)
bysort subcat circuit year: egen  `i'_ct_pd = nvals(pdpass)
keep subcat circuit year `i'_pd `i'_ct_pd
gen `i'_ct_rate = `i'_ct_pd/`i'_pd
duplicates drop 
tempfile temp_`i'
save `temp_`i'', replace 
restore 
}

use `temp_dts_3', clear  
merge 1:1 subcat circuit year using `temp_dts_5', nogen 
merge 1:1 subcat circuit year using `temp_pd_dts_3', nogen 
merge 1:1 subcat circuit year using `temp_pd_dts_5', nogen 
fillin subcat circuit year 
foreach i of varlist dts_3_pd-pd_dts_5_ct_rate{
 replace `i' = 0 if mi(`i')
}
drop _fillin

label data "subcat-circuit-year panel describing the geographical distribution of businesses working on the same dominant subcat" 

label var dts_3_pd "Total n of businesses patented in this class as dominant tech class identified by parent company (t-2, t]"
label var dts_3_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-2, t]"
label var dts_3_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-2, t]"

label var dts_5_pd "Total n of businesses patented in this class as dominant tech class identified by parent company (t-4, t]"
label var dts_5_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-4, t]"
label var dts_5_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by parent company (t-4, t]"

label var pd_dts_5_pd "Total n of businesses patented in this class as dominant tech class identified by businesses (t-4, t]"
label var pd_dts_5_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-4, t]"
label var pd_dts_5_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-4, t]"

label var pd_dts_3_pd "Total n of businesses patented in this class as dominant tech class identified by businesses (t-2, t]"
label var pd_dts_3_ct_pd "N of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-2, t]"
label var pd_dts_3_ct_rate "Percent of businesses patented in this class as dominant tech class at this circuit identified by businesses (t-2, t]"
compress
save "pd_dts_ctyr", replace 


****************************************************************************
** Identify new entrants to each tech class and circuit from each industry**
****************************************************************************
use "pat_gvkey_ct.dta", clear
drop if mi(subcat)
bysort pdpass circuit subcat (year): gen tcat = _n
gen byte new_entry = cond(tcat == 1,  1, 0)
bysort subcat circuit sic year: egen new_sic_subcat_ct = sum(new_entry)
bysort subcat circuit year: egen new_subcat_ct = sum(new_entry)
bysort pdpass subcat circuit year: egen new_pd_subcat_ct = sum(new_entry)
keep pdpass subcat circuit sic year new_sic_subcat_ct new_subcat_ct new_pd_subcat_ct
*compile a pdpass-subcat-circuit-year panel 
duplicates drop

gen new_sic_entry = new_sic_subcat_ct - new_pd_subcat_ct
gen new_entry = new_subcat_ct - new_pd_subcat_ct

bysort pdpass subcat circuit (year): gen new_entry_f3 = new_entry[_n+1] + ///
 new_entry[_n+2] + new_entry[_n+3]
bysort pdpass subcat circuit (year): gen new_entry_f5 = new_entry[_n+1] + ///
 new_entry[_n+2] + new_entry[_n+3] + new_entry[_n+4] + new_entry[_n+5]
bysort pdpass subcat (year): gen new_sic_entry_f3 = new_sic_entry[_n+1] + ///
 new_sic_entry[_n+2] + new_sic_entry[_n+3]
bysort pdpass subcat circuit (year): gen new_sic_entry_f5 = new_sic_entry[_n+1] + ///
 new_sic_entry[_n+2] + new_sic_entry[_n+3] + new_sic_entry[_n+4] + new_sic_entry[_n+5]
  
preserve
use "pd_npatds.dta" , clear 
rename dts_3 subcat 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit subcat year using `temp' 
drop gvkey-pd_npatds_5
drop if _merge == 2
gen  dts_3 = 1 if _merge == 3
replace dts_3 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatds.dta" , clear 
rename dts_5 subcat 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit subcat year using `temp' 
drop gvkey-pd_npatds_5
drop if _merge == 2
gen  dts_5 = 1 if _merge == 3
replace dts_5 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatds.dta" , clear 
rename pd_dts_3 subcat 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit subcat year using `temp' 
drop gvkey-pd_npatds_5
drop if _merge == 2
gen  pd_dts_3 = 1 if _merge == 3
replace pd_dts_3 = 0 if _merge == 1 
drop _merge 

preserve
use "pd_npatds.dta" , clear 
rename pd_dts_5 subcat 
tempfile temp 
save `temp', replace 
restore 

merge 1:1 pdpass circuit subcat year using `temp' 
drop gvkey-pd_npatds_5
drop if _merge == 2
gen  pd_dts_5 = 1 if _merge == 3
replace pd_dts_5 = 0 if _merge == 1 
drop _merge 

label var pd_dts_3 "Dominant subcat defined by busienss patent pool (t-3,t]"
label var pd_dts_5 "Dominant subcat defined by busienss patent pool (t-5,t]"
label var dts_3 "Dominant subcat defined by parent patent pool (t-3,t]"
label var dts_5 "Dominant subcat defined by parent patent pool (t-5,t]"

foreach i in dts_3 dts_5 pd_dts_3 pd_dts_5{
gen new_sic_entry_`i' = new_sic_entry if `i' == 1
replace new_sic_entry_`i' = 0 if `i' == 0
gen new_entry_`i' = new_subcat_ct - new_pd_subcat_ct if `i' == 1
replace new_entry_`i' = 0 if `i' == 0 
 }
 
* the data is organized by business-circuit-nclass-year, 
* now construct panel by business-circuit-year 

gcollapse (mean) newentry_mean = new_entry ///
  newentry_mean_f3 = new_entry_f3 ///
  new_sic_entry_mean = new_sic_entry  ///
  new_sic_entry_mean_f3 = new_sic_entry_f3  ///
  (sum) newentry_sum = new_entry ///
  newentry_sum_f3 = new_entry_f3 ///
  newentry_sic_sum = new_sic_entry /// 
  newentry_sic_sum_f3 = new_sic_entry_f3  ///
  (sum) new_sic_entry_dts_3 new_sic_entry_dts_5 new_sic_entry_pd_dts_3 ///
  new_sic_entry_pd_dts_5 new_entry_dts_3 new_entry_dts_5 new_entry_pd_dts_3 ///
  new_entry_pd_dts_5,   by(pdpass circuit year)
  
foreach i of varlist newentry_mean-newentry_sic_sum_f3{
 rename `i' `i'_subcat 
}

compress
save "newentry_subcat_pdct.dta", replace

*******************************
*** Company characteristics ***
*******************************

use "\Compustat\compustat.dta", clear
desc
summ datadate, format
tab indfmt
destring gvkey, replace

duplicates tag gvkey datadate, gen(dup)
tab dup

preserve
drop if dup==1
restore

/* There are  46838 obs with one duplicate based on gvkey and datadate,
so we can get rid of 46838 obs */
order gvkey datadate dup

preserve
/* Identify and keep all obs for gvkeys that have a duplicate. */
gen keeper=0
levelsof gvkey if dup==1, local(levels)
foreach x of local levels {
replace keeper=1 if `x' == gvkey
}

/* Tabulate dup before and after dropping obs without duplicates
as a logic check. */
tab dup
keep if keeper==1
tab dup

/* Duplicates are mostly for INDFMT's equal to INDL and FS for the
same firm. These are two different reporting formats that, per
COMPUSTAT, allow firms that are non-financial services to also report
in a finacial services format. Per an eyeball inspection of the data,
most of the FS line items are mostly missing values. The goal is to
keep whichever obs has the least missing values */
restore

egen mc= rowmiss(bkvlps- prcc_f)

/* intermediate step to create a variable equal to the smallest missing
value count within each gvkey and datadate */

bys gvkey datadate: egen minmc= min(mc)
gen del= 0
replace del=1 if dup != 0 & gvkey[_n]== gvkey[_n+1] & minmc != mc | ///
dup !=0 & gvkey[_n]== gvkey[_n-1] & minmc != mc

/* first part of OR statement above will tag all obs unless the last
one for the firm is the one that should be deleted. The code after the OR
operator takes care of marking the last obs for deletion within the firm if
that is one that should be deleted */

order gvkey datadate dup mc minmc del
tab indfmt if del==1
list gvkey sic if del==1 & indfmt== "INDL"

/* Per OSHA SIC codes, SIC codes 60-67 are Finance, Insurance,
and Real Estate. Makes sense that these firms have more missing values for
INDL than FS in the indfmt variable. */

destring sic, replace
count if sic > 5999 & sic < 6800
/* 137,605 Division H, Finance, Insurancec, Real Estate firms */
tab indfmt if sic >5999 & sic < 6800
/* but looks like most report their results using INDL and not FS */

count if del==1 & indfmt== "INDL"
drop if del==1

/* Above we dropped 35,927 of the 36,852 duplicates. There are 925
remaining duplicates based on gvkey and datadate based on our
previous calculation. */

duplicates tag gvkey datadate, gen(dup2)
tab dup2

gen tie_gen=0
bys gvkey datadate: replace tie_gen=1 if mc[_n]== mc[_n+1]
/* within gvkey and datadate's, identify tie_gens of missing value sums. This will
tag one of the obs, and here we don't care which we retain since they have
the same number of missing values (unless some are more important than others,
but no reason to think that in this case), so we can drop either one.
Note that this only marks one for deletion and not both */
order gvkey datadate dup dup2 mc minmc del tie_gen

drop dup2
drop if tie_gen==1

/* Now that duplicate datadates have been taken care of, create
a variable equal to the year component of datadate so that an
annual xtset may be done */
gen year= year(datadate)
duplicates tag gvkey year, gen(ydup)
tab ydup

/* While gkvey datadate is now unique, gvkey year is not */
preserve
keep if ydup==1

/* An eyeball inspection shows that these duplicates are for firms
who changed their financial reporting period year-ends. */

list gvkey datadate year fyear in 1/10

/* Note that fyear has been defined by COMPUSTAT to attempt to
compensate for this behavior */
restore

count
count if fyear==.
/* The 723 missing values above will trigger an error if we try to xtset
based on gvkey and fyear now. We therefore replace missing fyear based
on the COMPUSTAT data definition */
replace fyear= year(datadate)-1 if fyear== . & month(datadate) <=5
replace fyear= year(datadate) if fyear==. & month(datadate) >5

/* All 723 missing fyears have been taken care of above. Next, we check
for duplicates */
duplicates tag gvkey fyear, gen(dup4)
tab dup4

/* There are 36/2= 18 duplicates. This is explored below */
list gvkey datadate fyear if dup4==1
/* These appear to be firms that switched financial reporting periods
but ended up with the same fyear based on the old and new year-ends.
Eyeball inspection shows that there are many missing values for one each of
the years associated in each of these obs. We will drop whichever
has the most missing values using a modification of the minmc code
used previously. */

/* The original minmc code needs to be modified since minmc was originally
caluclated based on two observations for firms with the same DATADATE.
Here, these firms switched financial reporting period-ends, but not to the
extent that the FYEAR variable changed. Need to create a variable equal to the
year component for each obs and re-calculate minmc. */

drop minmc
bys gvkey fyear: egen minmc= min(mc)
order gvkey datadate fyear dup mc minmc del tie_gen
replace del=1 if dup4==1 & gvkey[_n]== gvkey[_n+1] & minmc != mc | ///
dup4 ==1 & gvkey[_n]== gvkey[_n-1] & minmc != mc
drop if del==1

/* similar to before, we also need to deal with any potential tie_gens */
bys gvkey fyear: replace tie_gen=1 if mc[_n]== mc[_n+1]
drop if tie_gen==1

xtset gvkey fyear, yearly
drop year
rename fyear year 
count

****emp_sumseg comes from Compustat subunit information****
merge 1:1 gvkey year using "\Compustat\emp_sumseg" 
drop if _merge == 2
drop _merge
replace emp = emp * 1000
replace emp = emp_sumseg if mi(emp) & !mi(emp_sumseg)
*(862 real changes made)
bysort gvkey: ipolate emp year, gen (emp_ipolated)
label variable emp_ipolated "Number of employees (interpolated)"
drop emp_sumseg
/*
replace gdp_defltr = gdp_defltr / 100 
replace gdpi_defltr = gdpi_defltr/ 100
gen at_adj = at/gdp_defltr 
gen ppe_adj = ppent/gdp_defltr 
*/
gen debt = dlc + dltt 
gen debt_current_ratio = dlc / at
gen debt_ratio = debt/at
gen leverage = debt/seq 
gen tobin = (debt + prcc_f*csho) / at 

destring sic,replace
gen sic2=int(sic/100)
gen sic3=int(sic/10)
gen ins_perc = cshi / cstk 
gen ex_fin = fincf / xrd 
gen liquidity = che/ at 

xtset gvkey year
gen change = txditc-l.txditc
gen cash_at = che/at
**Cash and Short-Term Investments
gen free_cash = ivncf + oancf
gen CF_rate = (oibdp-(txt-change)-dvp-dvc)/at
**Operating Income Before Depreciation minus tax plus foreign exchange income loss minus dividends minus
**
label var CF_rate "Free cash flow"
bysort sic2: egen xrd_avg = mean(xrd) 
gen xrd1 = xrd
replace xrd1 = 0 if mi(xrd)  
gen xrd2 = xrd
replace xrd2 = xrd_avg if mi(xrd2) 
label var xrd1 "R&D expenditure (missing to 0)"
label var xrd2 "R&D expenditure (missing to industry average)"
gen byte mi_rd = cond(mi(xrd), 1, 0)

gen rd_int1 = xrd/sale
bysort sic2 : egen rd_median = median(rd_int) 
gen rd_int2 = rd_int 
replace rd_int2 = rd_median if mi(xrd)  
replace rd_int1 = 0 if mi(xrd)
label var rd_int1 "R&D intensity (missing to 0)"
label var rd_int2 "R&D intensity (missing to industry median)"

xtset gvkey year
gen mtb = mkvalt/bkvlps
gen ppenme_ratio = ppenme/at  
gen revt_per = revt/emp_ipolated
gen ebitda_per = ebitda/emp_ipolated
gen ppe_per = ppegt/emp_ipolated
gen capex_ratio = capx/at

gen roa= ebitda / at 
*drop if ppe_per < 0
gen neg_revt = cond(revt < = 0, 1, 0)

winsor2 at revt tobin ppegt roa mtb ppenme_ratio capex_ratio ///
 leverage ebitda revt_per capx    xrd1 xrd2 , replace

xtset gvkey year 
gen ppe_per_inh = asinh(ppe_per)
gen revt_per_inh = asinh(revt_per)
gen capx_inh = asinh(capx)
gen at_ln = ln(at)
gen tobin_ln = ln(tobin)
gen xrd1_inh = asinh(xrd1)
gen emp_ln = ln(emp_ipolated) 
gen emp_inh = asinh(emp_ipolated)
gen defense1 = inlist(sic, 3480, 3760)
gen defense2 = inlist(sic, 3480, 3510, 3720, 3721, 3724, 3728, 3760, 3812)

keep gvkey-year at ebitda ebit ppegt revt xrd xad xsga ///
 xstf xstfws xt naicsh sich rank au auop auopic addzip busdesc city /// 
 emp conml county naics spcindcd sic spcseccd spcsrc state /// 
 ipodate emp_ipolated-defense2 naics sic cik ivncf oancf che defense1 defense2 ///
 dvdnp cdvc dv dvc dvdnp dvintf dvp dvpa dvpd dvt dvsco dvrpiv dvpdp

gen div_yes = cond(dvt != 0, 1, 0)
label var div_yes "Pay any dividends"

rename state state_code
merge m:1 state_code year using "fedapp_yrs.dta"
drop if _merge == 2
drop _merge 
rename circuit cmp_circuit 
rename state_code cmp_state
save "cmpst_proed.dta", replace
